package crawler;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
 
/**
 * Extracts hyperlinks (the {@code href} value and the link text) from HTML
 * using regular expressions.
 *
 * <p>This is a lightweight scanner, not an HTML parser: an attribute value
 * that itself contains a {@code >} character, or anchors nested inside other
 * anchors, are not handled.
 *
 * <p>Instances hold no mutable state; the compiled patterns are shared and
 * every call to {@link #grabHTMLLinks(String)} uses its own matchers.
 */
public class HTMLLinkExtrator{

	/**
	 * Matches one anchor element. Group 1 is the attribute text of the
	 * opening tag, group 2 is everything between the opening and closing tag.
	 * Whitespace is required after {@code <a} so that elements such as
	 * {@code <abbr>} or {@code <area>} are not mistaken for anchors; DOTALL
	 * lets the link text span several lines, and {@code .*?} allows it to be
	 * empty.
	 */
	private static final String HTML_A_TAG_PATTERN =
			"(?is)<a\\s+([^>]*)>(.*?)</a\\s*>";

	/**
	 * Matches an {@code href} attribute. Exactly one of the groups takes part
	 * in a match: group 1 for a double-quoted value, group 2 for a
	 * single-quoted value, group 3 for an unquoted value. The quotes are never
	 * part of the captured text. The look-behind rejects attribute names that
	 * merely end in "href", such as {@code data-href}.
	 */
	private static final String HTML_A_HREF_TAG_PATTERN =
			"(?i)(?<![\\w-])href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^'\">\\s]+))";

	// Compiled once: Pattern instances are immutable and safe to share.
	private static final Pattern TAG_PATTERN = Pattern.compile(HTML_A_TAG_PATTERN);
	private static final Pattern HREF_PATTERN = Pattern.compile(HTML_A_HREF_TAG_PATTERN);

	/** Creates an extractor. */
	public HTMLLinkExtrator(){
	}

	/**
	 * Finds every anchor element in the given HTML and returns its target and
	 * link text.
	 *
	 * @param html html content to scan; {@code null} is treated as empty input
	 * @return the links in document order, one entry per {@code href}
	 *         attribute found inside an anchor's opening tag; never
	 *         {@code null}. The link has its surrounding quotes removed and
	 *         the link text is the raw markup between the tags.
	 */
	public ArrayList<HtmlLink> grabHTMLLinks(final String html){

		ArrayList<HtmlLink> result = new ArrayList<HtmlLink>();
		if (html == null) {
			return result;
		}

		Matcher tagMatcher = TAG_PATTERN.matcher(html);
		while (tagMatcher.find()) {

			String attributes = tagMatcher.group(1);
			String linkText = tagMatcher.group(2);

			Matcher hrefMatcher = HREF_PATTERN.matcher(attributes);
			while (hrefMatcher.find()) {
				result.add(new HtmlLink(hrefValue(hrefMatcher), linkText));
			}
		}

		return result;
	}

	/**
	 * Returns the attribute value captured by a successful match of the href
	 * pattern, without surrounding quotes.
	 */
	private static String hrefValue(Matcher hrefMatcher) {
		// Only one of the three alternatives participates in a match; the
		// other groups are null.
		for (int group = 1; group <= 3; group++) {
			String value = hrefMatcher.group(group);
			if (value != null) {
				return value;
			}
		}
		return "";
	}

	/**
	 * A link found in the scanned HTML: the href value and the text between
	 * the anchor tags.
	 */
	// Deliberately left as an inner (non-static) class with package-private
	// fields so existing code that uses it keeps compiling.
	class HtmlLink {

		String link;
		String linkText;

		HtmlLink(String link, String linkText){
			this.link = link;
			this.linkText = linkText;
		}

		@Override
		public String toString() {
			return "Link : " + this.link + "\n" + "Link Text : " + this.linkText + '\n';
		}

	}

	/**
	 * Small demonstration: prints the links found in a sample snippet that
	 * mixes tag case, attribute order and quoting styles.
	 */
	public static void main(String[] args)
	{
		HTMLLinkExtrator htmlLinkExtrator = new HTMLLinkExtrator();
		String html = 	"abc hahaha <a href='http://www.google.com'>google</a>" +
						"abc hahaha <a HREF='http://www.google.com'>google</a>" +
						"abc hahaha <A HREF='http://www.google.com'>google</A> , " +
						"abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>" +
						"abc hahaha <A HREF='http://www.google.com' target='_blank'>google</A>" +
						"abc hahaha <A target='_blank' HREF='http://www.google.com'>google</A>" +
						"abc hahaha <a HREF=http://www.google.com>google</a>";


		ArrayList<HtmlLink> links  = htmlLinkExtrator.grabHTMLLinks(html);
		for(HtmlLink link : links)
		{
			System.out.println(link.toString());
		}
	}
}